# --------------------------------
# setup
# --------------------------------

# libraries
library(dplyr)
library(lubridate)
library(topicmodels)
library(ggplot2)
library(tidytext)

# --------------------------------
# load pre-trained topic models
# --------------------------------
tm <- readRDS("data/outputs/tm_165.rds")
document_dates <- readRDS("data/outputs/document_dates.rds")
document_dates <- document_dates %>% mutate(year = year(date), date_by_month = format(date, "%Y-%m"))

# --------------------------------
# figure A.5 (a)
# --------------------------------

# per document topic proabilities matrix (gamma)
doc_topics <- tidy(tm, matrix = "gamma") %>% arrange(document, -gamma) 

# merge with dates and average over date (year)
doc_topics <- merge(doc_topics, document_dates, by = "document") %>% filter(year < 2013)
doc_topics_avg <- doc_topics %>% 
  group_by(topic, year) %>% 
  summarize(gamma = mean(gamma), .groups = 'drop_last') %>% 
  ungroup() %>%
  arrange(year, -gamma)

# per topic word proabilities matrix (beta)
topic_term <- tidy(tm, matrix = "beta") 

# top words per topic
top_words_by_topic <- topic_term %>% 
  group_by(topic) %>% 
  top_n(10, wt = beta) %>% 
  ungroup() %>%
  arrange(topic, -beta)

# top topics per year
top_topics_yearly <- doc_topics_avg %>% 
  group_by(year) %>% 
  top_n(3, wt = gamma) %>% 
  ungroup() %>%
  arrange(year, -gamma)

# plot top 3 topics for 1998
top_topics_yearly$topic[top_topics_yearly$year == 1998]

# plot
fgA5a <- top_words_by_topic %>% filter(topic %in% top_topics_yearly$topic[top_topics_yearly$year == 1998]) %>%
  mutate(topic = factor(topic, levels = as.character(top_topics_yearly$topic[top_topics_yearly$year == 1998])),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered() +
  scale_fill_grey() +
  xlab("Per-topic-per-word proability (Beta)") + 
  theme(panel.background = element_blank(),
        axis.text.x = element_text(size=16, hjust = 1, vjust = 0.5, angle = 90),
        axis.text.y = element_text(size=16),
        axis.title.y =  element_blank(),
        axis.title.x = element_text(size=18, vjust = -2),
        strip.text = element_text(size = 18),
        plot.margin = unit(c(1,1,1,1), "cm"))

ggsave(filename = "fgA5a.pdf", plot = fgA5a, height = 12, width = 12, path = './figures/', dpi = 1000)

# --------------------------------
# figure A.5 (b)
# --------------------------------

# per document topic proabilities matrix (gamma)
doc_topics <- tidy(tm, matrix = "gamma") %>% arrange(document, -gamma) 

# merge with dates and average over date (year)
doc_topics <- merge(doc_topics, document_dates, by = "document") %>% filter(year < 2013)
doc_topics_avg <- doc_topics %>% 
  group_by(topic, year) %>% 
  summarize(gamma = mean(gamma), .groups = 'drop_last') %>% 
  ungroup() %>%
  arrange(year, -gamma)


# distribution of top 3 topics
fgA5b <-top_topics_yearly %>%
  group_by(year) %>%
  mutate(topic_order = 1:3) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = gamma, fill = as.character(topic_order))) +
  geom_bar(stat="identity") +
  ylab('Per-document-per-topic probability (Gamma) \n for top three topics each year') +
  xlab('Year') +
  scale_x_continuous(breaks = seq(1998, 2012, by = 1)) +
  expand_limits(x = 1998, y = 0) +
  scale_fill_grey() +
  theme(plot.title = element_text(size=20, margin = margin(t = 20, r = 0, b = 0, l = 0)), panel.background = element_blank(),
        axis.text.x = element_text(size=16, angle = 90, vjust = 0.5),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=18, margin = margin(t = 0, r = 15, b = 0, l = 15)),
        axis.title.x = element_blank(),
        legend.position = "none")

ggsave(filename = "fgA5b.pdf", plot = fgA5b, height = 12, width = 12, path = './figures/', dpi = 1000)

